This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Import the Spotify songs dataset (the CSV is expected in the working
# directory) and show its first rows as a quick sanity check.
spotify <- read.csv(file = "Popular_Spotify_Songs.csv")
head(x = spotify)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
# Re-read the dataset so this chunk starts from the raw CSV contents,
# then display the leading six rows.
spotify <- read.csv(file = "Popular_Spotify_Songs.csv", header = TRUE)
head(spotify, n = 6L)
# Coerce the count columns that were imported as text to numeric.
#
# `streams` has to be numeric as well: the scatterplot matrix, the
# correlations and the regression models further down all treat it as a
# number.
#
# Thousands separators are stripped before coercion because
# as.numeric("1,021") is NA, which would silently discard the largest counts.
# NOTE(review): presumably the CSV writes large counts with commas (the
# coercion warnings and the sub-1000 maxima point that way) -- confirm
# against the file.
# Entries that are still non-numeric afterwards become NA with a warning.
spotify$streams <- as.numeric(gsub(",", "", spotify$streams, fixed = TRUE))
spotify$in_deezer_playlists <- as.numeric(
  gsub(",", "", spotify$in_deezer_playlists, fixed = TRUE)
)
Warning: NAs introduced by coercion
# Coerce the Shazam chart counts from text to numeric. Thousands separators
# are removed first, since as.numeric("1,021") would otherwise yield NA.
# NOTE(review): presumably large counts are stored with commas -- confirm
# against the CSV. Remaining non-numeric entries become NA.
spotify$in_shazam_charts <- as.numeric(
  gsub(",", "", spotify$in_shazam_charts, fixed = TRUE)
)
Warning: NAs introduced by coercion
# Show the type and first values of columns 3 through 14.
str(spotify[, seq(from = 3, to = 14)])
'data.frame': 953 obs. of 12 variables:
$ artist_count : int 2 1 1 1 1 2 2 1 1 2 ...
$ released_year : int 2023 2023 2023 2019 2023 2023 2023 2023 2023 2023 ...
$ released_month : int 7 3 6 8 5 6 3 7 5 3 ...
$ released_day : int 14 23 30 23 18 1 16 7 15 17 ...
$ in_spotify_playlists: int 553 1474 1397 7858 3133 2186 3090 714 1096 2953 ...
$ in_spotify_charts : int 147 48 113 100 50 91 50 43 83 44 ...
$ streams : num 1.41e+08 1.34e+08 1.40e+08 8.01e+08 3.03e+08 ...
$ in_apple_playlists : int 43 48 94 116 84 67 34 25 60 49 ...
$ in_apple_charts : int 263 126 207 207 133 213 222 89 210 110 ...
$ in_deezer_playlists : num 45 58 91 125 87 88 43 30 48 66 ...
$ in_deezer_charts : int 10 14 14 12 15 17 13 13 11 13 ...
$ in_shazam_charts : num 826 382 949 548 425 946 418 194 953 339 ...
# Scatterplot matrix of columns 3-14, followed by summary statistics for
# every column of the data frame.
pairs(
  x = spotify[, 3:14],
  main = "Linear Relationships Between Metrics"
)
summary(object = spotify)
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts
Length:953 Length:953 Min. :1.000 Min. :1930 Min. : 1.000 Min. : 1.00 Min. : 31 Min. : 0.00
Class :character Class :character 1st Qu.:1.000 1st Qu.:2020 1st Qu.: 3.000 1st Qu.: 6.00 1st Qu.: 875 1st Qu.: 0.00
Mode :character Mode :character Median :1.000 Median :2022 Median : 6.000 Median :13.00 Median : 2224 Median : 3.00
Mean :1.556 Mean :2018 Mean : 6.034 Mean :13.93 Mean : 5200 Mean : 12.01
3rd Qu.:2.000 3rd Qu.:2022 3rd Qu.: 9.000 3rd Qu.:22.00 3rd Qu.: 5542 3rd Qu.: 16.00
Max. :8.000 Max. :2023 Max. :12.000 Max. :31.00 Max. :52898 Max. :147.00
streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm key
Min. :2.762e+03 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.000 Min. : 0.00 Min. : 65.0 Length:953
1st Qu.:1.416e+08 1st Qu.: 13.00 1st Qu.: 7.00 1st Qu.: 12.0 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:100.0 Class :character
Median :2.905e+08 Median : 34.00 Median : 38.00 Median : 36.5 Median : 0.000 Median : 2.00 Median :121.0 Mode :character
Mean :5.141e+08 Mean : 67.81 Mean : 51.91 Mean :109.7 Mean : 2.666 Mean : 51.18 Mean :122.5
3rd Qu.:6.739e+08 3rd Qu.: 88.00 3rd Qu.: 87.00 3rd Qu.:110.0 3rd Qu.: 2.000 3rd Qu.: 36.00 3rd Qu.:140.0
Max. :3.704e+09 Max. :672.00 Max. :275.00 Max. :974.0 Max. :58.000 Max. :953.00 Max. :206.0
NA's :1 NA's :79 NA's :57
mode danceability_. valence_. energy_. acousticness_. instrumentalness_. liveness_. speechiness_.
Length:953 Min. :23.00 Min. : 4.00 Min. : 9.00 Min. : 0.00 Min. : 0.000 Min. : 3.00 Min. : 2.00
Class :character 1st Qu.:57.00 1st Qu.:32.00 1st Qu.:53.00 1st Qu.: 6.00 1st Qu.: 0.000 1st Qu.:10.00 1st Qu.: 4.00
Mode :character Median :69.00 Median :51.00 Median :66.00 Median :18.00 Median : 0.000 Median :12.00 Median : 6.00
Mean :66.97 Mean :51.43 Mean :64.28 Mean :27.06 Mean : 1.581 Mean :18.21 Mean :10.13
3rd Qu.:78.00 3rd Qu.:70.00 3rd Qu.:77.00 3rd Qu.:43.00 3rd Qu.: 0.000 3rd Qu.:24.00 3rd Qu.:11.00
Max. :96.00 Max. :97.00 Max. :97.00 Max. :97.00 Max. :91.000 Max. :97.00 Max. :64.00
# Number of missing values in each column.
colSums(x = is.na(spotify), na.rm = FALSE)
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists
0 0 0 0 0 0 0
in_spotify_charts streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts
0 1 0 0 79 0 57
bpm key mode danceability_. valence_. energy_. acousticness_.
0 0 0 0 0 0 0
instrumentalness_. liveness_. speechiness_.
0 0 0
# Row and column counts of the data frame.
c(nrow(spotify), ncol(spotify))
[1] 953 24
library(ggplot2)
# Histogram of release years, one bar per year. The title and x label name
# the variable that is actually plotted; names() of an unnamed vector is
# NULL, which would remove the x-axis title altogether.
ggplot(spotify, aes(x = released_year)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "white") +
  labs(title = "Distribution of Released Year", x = "Released Year")

# Kernel density estimate of the same variable, drawn with base graphics.
plot(
  density(spotify$released_year, na.rm = TRUE),
  main = "Density Plot of Released Year",
  xlab = "Released Year",
  col = "blue",
  lwd = 2
)

# Open the data in the spreadsheet-style viewer.
View(spotify)
# Streams against Spotify playlist count, one point per song, coloured by
# the song's `mode` value.
ggplot(data = spotify) +
  geom_point(
    mapping = aes(x = streams, y = in_spotify_playlists, color = mode)
  ) +
  ggtitle("Streams vs Playlist Metrics by Mode") +
  xlab("Streams") +
  ylab("Number in Spotify Playlists") +
  theme_minimal()
library(shiny)
library(ggplot2)
# User interface: a sidebar with one checkbox per distinct mode (all ticked
# initially) and a main panel holding the scatter plot.
ui <- fluidPage(
  titlePanel("Streams vs Spotify Playlists by Mode"),
  sidebarLayout(
    sidebarPanel(
      checkboxGroupInput(
        inputId = "mode_select",
        label = "Select Mode(s):",
        choices = unique(spotify$mode),
        selected = unique(spotify$mode)
      )
    ),
    mainPanel(plotOutput(outputId = "scatterPlot"))
  )
)

# Server: redraw the scatter plot from the songs whose mode is ticked.
server <- function(input, output) {
  output$scatterPlot <- renderPlot({
    keep_row <- spotify$mode %in% input$mode_select
    filtered_data <- spotify[keep_row, ]

    ggplot(
      filtered_data,
      aes(x = streams, y = in_spotify_playlists, color = mode)
    ) +
      geom_point() +
      ggtitle("Streams vs Playlist Metrics by Mode") +
      xlab("Streams") +
      ylab("Number in Spotify Playlists") +
      theme_minimal()
  })
}

# Launch the app.
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
library(dplyr)
# Label every track as "<song> - <artist>" for use in plot legends.
spotify <- mutate(
  spotify,
  song.artist = paste(track_name, "-", artist.s._name)
)

# Most-streamed song of each release year (ties are all kept).
yearly_top_song <- ungroup(
  slice_max(
    group_by(spotify, released_year),
    order_by = streams,
    n = 1,
    with_ties = TRUE
  )
)

# Of those per-year winners, keep the ten with the most streams.
top10_yearly <- yearly_top_song %>%
  arrange(desc(streams)) %>%
  slice_head(n = 10)
top10_yearly
# Make the song label a factor whose levels follow descending stream count,
# so legends list the most-streamed songs first.
top10_yearly <- arrange(top10_yearly, desc(streams))
top10_yearly$song.artist <- factor(
  top10_yearly$song.artist,
  levels = unique(top10_yearly$song.artist)
)
top10_yearly
# Bar chart of the top yearly songs: one bar per release year, filled by
# song. The axis and legend titles describe the variables actually mapped
# (x is the release year, fill is the song label).
ggplot(
  top10_yearly,
  aes(x = released_year, y = streams, fill = factor(song.artist))
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top Streamed Songs per Year",
    x = "Release Year",
    y = "Number of Streams",
    fill = "Track (Song - Artist)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Same chart with the release year treated as a discrete axis, so only the
# years present get a tick.
ggplot(data = top10_yearly) +
  geom_col(
    mapping = aes(x = factor(released_year), y = streams, fill = song.artist)
  ) +
  ggtitle("Top 10 Streamed Songs By Year") +
  xlab("Year") +
  ylab("Number of Streams") +
  labs(fill = "Song-Artist") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Five most-streamed songs of every release year; songs tied at the cut-off
# are all retained.
yearly_top_songs <- ungroup(
  slice_max(
    group_by(spotify, released_year),
    order_by = streams,
    n = 5,
    with_ties = TRUE
  )
)

# Inspect the result in the data viewer.
View(yearly_top_songs)
# Restrict the per-year top songs to releases from 2023.
top_2023 <- filter(yearly_top_songs, released_year == 2023)

# Bars ordered from most to fewest streams, filled by song label.
ggplot(data = top_2023) +
  geom_col(
    mapping = aes(
      x = reorder(track_name, -streams),
      y = streams,
      fill = song.artist
    )
  ) +
  ggtitle("Top 5 Streamed Songs in 2023") +
  xlab("Song") +
  ylab("Number of Streams") +
  labs(fill = "Song & Artist") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Dimensions of the 2022 slice of the per-year top songs.
dim(yearly_top_songs[yearly_top_songs[["released_year"]] == 2022, ])
[1] 5 25
# Column summaries for the 2022 rows, to spot NAs or implausible values.
summary(
  object = yearly_top_songs[yearly_top_songs[["released_year"]] == 2022, ]
)
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts
Length:5 Length:5 Min. :1.0 Min. :2022 Min. :3.0 Min. : 6.0 Min. : 8506 Min. : 42.0
Class :character Class :character 1st Qu.:1.0 1st Qu.:2022 1st Qu.:5.0 1st Qu.: 6.0 1st Qu.: 8576 1st Qu.: 42.0
Mode :character Mode :character Median :2.0 Median :2022 Median :5.0 Median : 6.0 Median : 8870 Median : 43.0
Mean :1.6 Mean :2022 Mean :5.8 Mean :14.2 Mean :11713 Mean : 60.4
3rd Qu.:2.0 3rd Qu.:2022 3rd Qu.:7.0 3rd Qu.:22.0 3rd Qu.: 9037 3rd Qu.: 45.0
Max. :2.0 Max. :2022 Max. :9.0 Max. :31.0 Max. :23575 Max. :130.0
streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm key
Min. :1.231e+09 Min. : 94.0 Min. : 65.0 Min. :139.0 Min. :14.0 Min. : 49.0 Min. : 92.0 Length:5
1st Qu.:1.264e+09 1st Qu.:104.0 1st Qu.:108.0 1st Qu.:141.0 1st Qu.:14.0 1st Qu.:127.8 1st Qu.:107.0 Class :character
Median :1.357e+09 Median :124.0 Median :120.0 Median :164.0 Median :26.0 Median :160.0 Median :128.0 Mode :character
Mean :1.561e+09 Mean :188.2 Mean :124.8 Mean :327.6 Mean :25.2 Mean :136.2 Mean :126.4
3rd Qu.:1.441e+09 3rd Qu.:216.0 3rd Qu.:133.0 3rd Qu.:331.0 3rd Qu.:26.0 3rd Qu.:168.5 3rd Qu.:131.0
Max. :2.513e+09 Max. :403.0 Max. :198.0 Max. :863.0 Max. :46.0 Max. :176.0 Max. :174.0
NA's :1
mode danceability_. valence_. energy_. acousticness_. instrumentalness_. liveness_. speechiness_. song.artist
Length:5 Min. :52.0 Min. :19.0 Min. :47.0 Min. : 1 Min. :0.0 Min. : 9.0 Min. : 4.0 Length:5
Class :character 1st Qu.:62.0 1st Qu.:24.0 1st Qu.:71.0 1st Qu.: 1 1st Qu.:0.0 1st Qu.:13.0 1st Qu.: 6.0 Class :character
Mode :character Median :65.0 Median :43.0 Median :72.0 Median : 9 Median :0.0 Median :23.0 Median : 8.0 Mode :character
Mean :68.2 Mean :41.4 Mean :68.2 Mean :11 Mean :0.6 Mean :20.6 Mean :10.4
3rd Qu.:71.0 3rd Qu.:55.0 3rd Qu.:73.0 3rd Qu.:10 3rd Qu.:0.0 3rd Qu.:27.0 3rd Qu.: 9.0
Max. :91.0 Max. :66.0 Max. :78.0 Max. :34 Max. :3.0 Max. :31.0 Max. :25.0
# Print the 2022 rows of the per-year top songs in full.
print(yearly_top_songs[yearly_top_songs[["released_year"]] == 2022, ])

# Compact overview of name, artist and streams for every 2022 release in
# the complete dataset.
glimpse(
  select(
    filter(spotify, released_year == 2022),
    track_name, artist.s._name, streams
  )
)
Rows: 402
Columns: 3
$ track_name <chr> "As It Was", "Kill Bill", "Calm Down (with Selena Gomez)", "Creepin'", "Anti-Hero", "I'm Good (Blue)", "I Ain't Worried", "La Ba…
$ artist.s._name <chr> "Harry Styles", "SZA", "R��ma, Selena G", "The Weeknd, 21 Savage, Metro Boomin", "Taylor Swift", "Bebe Rexha, David Guetta", "On…
$ streams <dbl> 2513188493, 1163093654, 899183384, 843957510, 999748277, 1109433169, 1085685420, 1214083358, 720434240, 674072710, 404562836, 37…
# Occurrences of each track name among the 2022 top songs (a count above
# one would indicate duplicates or ties).
count(filter(yearly_top_songs, released_year == 2022), track_name)

# Missing-value counts for the fields used by the charts.
summarise(
  filter(yearly_top_songs, released_year == 2022),
  missing_streams = sum(is.na(streams)),
  missing_track = sum(is.na(track_name)),
  missing_artist = sum(is.na(song.artist))
)
# Shiny app to view the top streamed songs for a year chosen from a drop-down
library(shiny)
library(dplyr)
library(ggplot2)
# User interface: a year drop-down (sorted distinct release years) beside
# the bar chart.
ui <- fluidPage(
  titlePanel("Top Streamed Songs by Year"),
  sidebarLayout(
    sidebarPanel(
      selectInput(
        inputId = "year",
        label = "Select Year:",
        choices = sort(unique(yearly_top_songs$released_year))
      )
    ),
    mainPanel(plotOutput(outputId = "topSongsPlot"))
  )
)

# Server: draw the top songs of the selected year.
server <- function(input, output, session) {
  output$topSongsPlot <- renderPlot({
    selected_year_data <- filter(yearly_top_songs, released_year == input$year)

    # Replace bytes that are not valid UTF-8 with "*" so track names can be
    # rendered as axis labels.
    selected_year_data$track_name <- iconv(
      selected_year_data$track_name,
      from = "UTF-8",
      to = "UTF-8",
      sub = "*"
    )

    ggplot(data = selected_year_data) +
      geom_col(
        mapping = aes(
          x = reorder(track_name, -streams),
          y = streams,
          fill = song.artist
        )
      ) +
      ggtitle(paste("Top 5 Streamed Songs in", input$year)) +
      xlab("Song") +
      ylab("Number of Streams") +
      labs(fill = "Song & Artist") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  })
}

# Launch the app.
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
# Keep just the numeric columns of the dataset.
numeric_cols <- spotify[vapply(spotify, is.numeric, logical(1))]
numeric_cols

# Pairwise correlation matrix of all numeric columns, computed on the rows
# that have no missing value in any of them.
correlations <- cor(x = numeric_cols, use = "complete.obs")
correlations
artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts streams in_apple_playlists
artist_count 1.000000000 0.061445644 0.009720347 -0.044766245 -0.0746868039 -0.002421656 -0.1090468634 -0.008712241
released_year 0.061445644 1.000000000 0.031372926 0.160042169 -0.3305741123 0.100988420 -0.1483509269 -0.155648773
released_month 0.009720347 0.031372926 1.000000000 -0.015820232 -0.0187633639 -0.031526077 0.0413240023 0.007380967
released_day -0.044766245 0.160042169 -0.015820232 1.000000000 -0.0320967227 0.042203010 0.0410748362 0.028622345
in_spotify_playlists -0.074686804 -0.330574112 -0.018763364 -0.032096723 1.0000000000 0.173307807 0.7650951338 0.709922084
in_spotify_charts -0.002421656 0.100988420 -0.031526077 0.042203010 0.1733078066 1.000000000 0.2454749140 0.213322335
streams -0.109046863 -0.148350927 0.041324002 0.041074836 0.7650951338 0.245474914 1.0000000000 0.663657168
in_apple_playlists -0.008712241 -0.155648773 0.007380967 0.028622345 0.7099220838 0.213322335 0.6636571679 1.000000000
in_apple_charts -0.079655066 0.007650642 -0.010603129 0.009855360 0.2087053786 0.565321488 0.2508103705 0.322358302
in_deezer_playlists -0.073406225 -0.265234545 -0.035433532 -0.041980554 0.7880546875 0.151785663 0.7185929567 0.645914528
in_deezer_charts 0.022218537 0.103287112 -0.001921194 0.063555630 0.1952907401 0.558419963 0.2594696320 0.409688235
in_shazam_charts -0.031812269 0.054492378 -0.090799317 0.040728906 0.1111503800 0.594678886 0.0587456970 0.187401561
bpm -0.067047448 -0.041957657 -0.051936323 -0.048020996 0.0260085534 0.028010413 0.0327164251 0.044415122
danceability_. 0.209581804 0.192054100 -0.034978955 0.076211130 -0.1066197808 0.075249362 -0.0754316227 0.011504320
valence_. 0.120784211 -0.064812792 -0.118074232 0.071279071 -0.0552336199 0.056602171 -0.0584550791 0.053187299
energy_. 0.149966302 0.130105474 -0.081977712 0.064572106 -0.0494256700 0.104328458 -0.0496657926 0.074416649
acousticness_. -0.101620287 -0.169751059 0.039266560 -0.010279631 0.0001543819 -0.078095007 0.0013286969 -0.088265650
instrumentalness_. -0.052814944 -0.014754771 0.031122232 0.007126726 0.0121080272 -0.012565007 -0.0009670221 -0.045488723
liveness_. 0.041035230 0.007441171 -0.017825352 0.002619619 -0.0339739648 -0.039153639 -0.0387277529 -0.046255149
speechiness_. 0.117955768 0.126711891 0.030599526 -0.017347379 -0.0719087372 -0.086192083 -0.0907281501 -0.101941835
in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm danceability_. valence_. energy_. acousticness_.
artist_count -0.079655066 -0.073406225 0.022218537 -0.031812269 -0.0670474482 0.209581804 0.120784211 0.149966302 -0.1016202867
released_year 0.007650642 -0.265234545 0.103287112 0.054492378 -0.0419576571 0.192054100 -0.064812792 0.130105474 -0.1697510593
released_month -0.010603129 -0.035433532 -0.001921194 -0.090799317 -0.0519363227 -0.034978955 -0.118074232 -0.081977712 0.0392665600
released_day 0.009855360 -0.041980554 0.063555630 0.040728906 -0.0480209963 0.076211130 0.071279071 0.064572106 -0.0102796313
in_spotify_playlists 0.208705379 0.788054688 0.195290740 0.111150380 0.0260085534 -0.106619781 -0.055233620 -0.049425670 0.0001543819
in_spotify_charts 0.565321488 0.151785663 0.558419963 0.594678886 0.0280104129 0.075249362 0.056602171 0.104328458 -0.0780950070
streams 0.250810371 0.718592957 0.259469632 0.058745697 0.0327164251 -0.075431623 -0.058455079 -0.049665793 0.0013286969
in_apple_playlists 0.322358302 0.645914528 0.409688235 0.187401561 0.0444151222 0.011504320 0.053187299 0.074416649 -0.0882656498
in_apple_charts 1.000000000 0.198692411 0.356675982 0.443346418 0.0512089175 -0.003976097 0.061427394 0.153590558 -0.1050831002
in_deezer_playlists 0.198692411 1.000000000 0.218281108 0.135919298 0.0453831408 -0.104850821 -0.025849620 -0.028605485 0.0288379089
in_deezer_charts 0.356675982 0.218281108 1.000000000 0.374829138 0.0370517105 0.087187954 0.075155386 0.108571701 -0.0439176997
in_shazam_charts 0.443346418 0.135919298 0.374829138 1.000000000 0.0891578410 -0.010179394 -0.003080391 0.095095549 -0.0716735649
bpm 0.051208918 0.045383141 0.037051710 0.089157841 1.0000000000 -0.140710959 0.050484657 0.003536259 -0.0020473755
danceability_. -0.003976097 -0.104850821 0.087187954 -0.010179394 -0.1407109592 1.000000000 0.390335848 0.186243358 -0.2390078796
valence_. 0.061427394 -0.025849620 0.075155386 -0.003080391 0.0504846571 0.390335848 1.000000000 0.354253808 -0.0680708838
energy_. 0.153590558 -0.028605485 0.108571701 0.095095549 0.0035362587 0.186243358 0.354253808 1.000000000 -0.5547718398
acousticness_. -0.105083100 0.028837909 -0.043917700 -0.071673565 -0.0020473755 -0.239007880 -0.068070884 -0.554771840 1.0000000000
instrumentalness_. -0.010658818 0.021617457 -0.002299823 -0.015732282 -0.0009552758 -0.098154216 -0.136058212 -0.032914831 0.0332206982
liveness_. -0.001551996 -0.005142997 0.002914949 -0.045209630 0.0005645641 -0.093272303 0.016319569 0.120967010 -0.0406689579
speechiness_. -0.157645853 -0.108361699 -0.073955127 -0.081685578 0.0247134810 0.173420342 0.036580343 -0.017125796 -0.0238770164
instrumentalness_. liveness_. speechiness_.
artist_count -0.0528149443 0.0410352297 0.11795577
released_year -0.0147547713 0.0074411712 0.12671189
released_month 0.0311222324 -0.0178253521 0.03059953
released_day 0.0071267258 0.0026196188 -0.01734738
in_spotify_playlists 0.0121080272 -0.0339739648 -0.07190874
in_spotify_charts -0.0125650073 -0.0391536392 -0.08619208
streams -0.0009670221 -0.0387277529 -0.09072815
in_apple_playlists -0.0454887232 -0.0462551494 -0.10194183
in_apple_charts -0.0106588177 -0.0015519961 -0.15764585
in_deezer_playlists 0.0216174566 -0.0051429975 -0.10836170
in_deezer_charts -0.0022998235 0.0029149486 -0.07395513
in_shazam_charts -0.0157322822 -0.0452096305 -0.08168558
bpm -0.0009552758 0.0005645641 0.02471348
danceability_. -0.0981542162 -0.0932723026 0.17342034
valence_. -0.1360582123 0.0163195694 0.03658034
energy_. -0.0329148310 0.1209670100 -0.01712580
acousticness_. 0.0332206982 -0.0406689579 -0.02387702
instrumentalness_. 1.0000000000 -0.0488636800 -0.08664221
liveness_. -0.0488636800 1.0000000000 -0.04518074
speechiness_. -0.0866422067 -0.0451807367 1.00000000
# Pull the `streams` row out of the matrix: the correlation of every
# numeric column with streams.
cor_with_streams <- correlations[match("streams", rownames(correlations)), ]
cor_with_streams
artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts streams
-0.1090468634 -0.1483509269 0.0413240023 0.0410748362 0.7650951338 0.2454749140 1.0000000000
in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm danceability_.
0.6636571679 0.2508103705 0.7185929567 0.2594696320 0.0587456970 0.0327164251 -0.0754316227
valence_. energy_. acousticness_. instrumentalness_. liveness_. speechiness_.
-0.0584550791 -0.0496657926 0.0013286969 -0.0009670221 -0.0387277529 -0.0907281501
# List the correlations from strongest positive to strongest negative
# (missing values, if any, are dropped).
cor_with_streams[
  order(cor_with_streams, decreasing = TRUE, na.last = NA)
]
streams in_spotify_playlists in_deezer_playlists in_apple_playlists in_deezer_charts in_apple_charts in_spotify_charts
1.0000000000 0.7650951338 0.7185929567 0.6636571679 0.2594696320 0.2508103705 0.2454749140
in_shazam_charts released_month released_day bpm acousticness_. instrumentalness_. liveness_.
0.0587456970 0.0413240023 0.0410748362 0.0327164251 0.0013286969 -0.0009670221 -0.0387277529
energy_. valence_. danceability_. speechiness_. artist_count released_year
-0.0496657926 -0.0584550791 -0.0754316227 -0.0907281501 -0.1090468634 -0.1483509269
# Linear model of streams on the three playlist counts plus three audio
# features; rows with missing values are dropped by lm().
model <- lm(
  formula = streams ~ in_spotify_playlists + in_deezer_playlists +
    in_apple_playlists + danceability_. + energy_. + valence_.,
  data = spotify
)
summary(object = model)
Call:
lm(formula = streams ~ in_spotify_playlists + in_deezer_playlists +
in_apple_playlists + danceability_. + energy_. + valence_.,
data = spotify)
Residuals:
Min 1Q Median 3Q Max
-1.068e+09 -1.246e+08 -3.334e+07 9.771e+07 1.301e+09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 182491364 45594165 4.003 6.80e-05 ***
in_spotify_playlists 42130 3557 11.844 < 2e-16 ***
in_deezer_playlists 566032 74514 7.596 7.90e-14 ***
in_apple_playlists 1507560 189228 7.967 5.11e-15 ***
danceability_. 223598 592024 0.378 0.7058
energy_. -717839 509189 -1.410 0.1590
valence_. -637129 379394 -1.679 0.0934 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 229500000 on 866 degrees of freedom
(80 observations deleted due to missingness)
Multiple R-squared: 0.6886, Adjusted R-squared: 0.6864
F-statistic: 319.2 on 6 and 866 DF, p-value: < 2.2e-16
# Reduced model that keeps only the three playlist-count predictors.
refined_model <- lm(
  formula = streams ~ in_spotify_playlists + in_deezer_playlists +
    in_apple_playlists,
  data = spotify
)

# Coefficients and fit statistics of the reduced model.
summary(object = refined_model)
Call:
lm(formula = streams ~ in_spotify_playlists + in_deezer_playlists +
in_apple_playlists, data = spotify)
Residuals:
Min 1Q Median 3Q Max
-1.082e+09 -1.173e+08 -3.519e+07 9.715e+07 1.311e+09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 119309688 10537578 11.322 < 2e-16 ***
in_spotify_playlists 43264 3526 12.271 < 2e-16 ***
in_deezer_playlists 565426 74598 7.580 8.88e-14 ***
in_apple_playlists 1427508 186160 7.668 4.67e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.3e+08 on 869 degrees of freedom
(80 observations deleted due to missingness)
Multiple R-squared: 0.686, Adjusted R-squared: 0.6849
F-statistic: 632.9 on 3 and 869 DF, p-value: < 2.2e-16
# Modelling table: the response and the three predictors, restricted to
# rows without missing values.
data_complete <- na.omit(
  select(
    spotify,
    streams, in_spotify_playlists, in_deezer_playlists, in_apple_playlists
  )
)

# Refit the three-predictor model on that table.
model <- lm(
  formula = streams ~ in_spotify_playlists + in_deezer_playlists +
    in_apple_playlists,
  data = data_complete
)

# In-sample predictions, stored both as a vector and as a new column.
predicted_streams <- predict(object = model, newdata = data_complete)
data_complete[["predicted_streams"]] <- predicted_streams
data_complete
# Predicted against actual streams; the dashed red diagonal marks perfect
# agreement.
ggplot(data = data_complete) +
  geom_point(
    mapping = aes(x = streams, y = predicted_streams),
    alpha = 0.6,
    color = "steelblue"
  ) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  ggtitle("Actual vs Predicted Streams") +
  xlab("Actual Streams") +
  ylab("Predicted Streams") +
  theme_minimal()
# Residuals of the refitted model, kept as a free-standing vector that the
# plot below picks up by name.
residuals <- model[["residuals"]]

# Inspect the modelling table in the data viewer.
View(data_complete)

# Residuals against predictions; the dashed red line marks zero error.
ggplot(
  data = data_complete,
  mapping = aes(x = predicted_streams, y = residuals)
) +
  geom_point(alpha = 0.6, color = "darkorange") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  ggtitle("Residual Plot") +
  xlab("Predicted Streams") +
  ylab("Residuals") +
  theme_minimal()
# Step 1: Load required libraries
library(caret)
# Fix the random seed so the fold assignment is reproducible.
set.seed(123)

# Model specification: same three predictors as the refined lm().
model_formula <- streams ~ in_spotify_playlists + in_deezer_playlists +
  in_apple_playlists

# Resampling scheme: 10-fold cross-validation.
train_control <- trainControl(method = "cv", number = 10)

# Fit the linear regression through caret using that scheme.
cv_model <- train(
  model_formula,
  method = "lm",
  trControl = train_control,
  data = data_complete
)

# Cross-validated performance summary.
print(cv_model)
Linear Regression
873 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 785, 786, 785, 786, 786, 787, ...
Resampling results:
RMSE Rsquared MAE
231655068 0.6849981 161789134
Tuning parameter 'intercept' was held constant at a value of TRUE
# Optional: Plot predictions vs. actuals again using cv_model$finalModel if desired

# Fit the final model once on the full modelling table.
# Without an explicit trControl, train() resamples with its default scheme
# and refits the model many times to produce performance estimates that are
# never used here. method = "none" skips resampling and fits a single model
# on all rows; the coefficients are the same.
final_model <- train(
  streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists,
  data = data_complete,
  method = "lm",
  trControl = trainControl(method = "none")
)

# View final coefficients
coef(final_model$finalModel)
(Intercept) in_spotify_playlists in_deezer_playlists in_apple_playlists
119309687.75 43263.74 565425.67 1427508.47
# One hypothetical song described by its three playlist counts; swap in
# real values as needed.
new_input <- data.frame(
  in_spotify_playlists = 2500,
  in_deezer_playlists = 50,
  in_apple_playlists = 250
)

# Predicted stream count for that song from the cross-validated model.
predicted_streams <- predict(object = cv_model, newdata = new_input)
print(predicted_streams)
1
612617449
# Load required packages
library(shiny)
# Define UI: three playlist-count inputs, a button, and the prediction text.
ui <- fluidPage(
  titlePanel("Predict Song Streams"),
  sidebarLayout(
    sidebarPanel(
      numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
      numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
      numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
      actionButton("predict", "Predict Streams")
    ),
    mainPanel(
      h3("Predicted Streams:"),
      verbatimTextOutput("prediction")
    )
  )
)

# Define server logic
server <- function(input, output) {
  # Prediction recomputed only when the button is clicked. Using
  # eventReactive() keeps the output defined once at the top level of the
  # server function instead of re-registering it inside an observer.
  prediction <- eventReactive(input$predict, {
    # An emptied numeric field yields NA; stop quietly (blank output)
    # instead of displaying "NA".
    req(input$spotify, input$deezer, input$apple)

    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(cv_model, newdata = new_input)
  })

  # Rounded prediction with thousands separators; stays empty until the
  # first click.
  output$prediction <- renderText({
    format(round(prediction(), 0), big.mark = ",")
  })
}

# Run the application
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
# Help lookup for the topic "cv_model". NOTE(review): `?` searches package
# documentation, so it reports nothing for this locally created object.
?cv_model
No documentation for ‘cv_model’ in specified packages and libraries:
you could try ‘??cv_model’
# Show the cross-validated model summary.
print(cv_model)
Linear Regression
873 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 785, 786, 785, 786, 786, 787, ...
Resampling results:
RMSE Rsquared MAE
231655068 0.6849981 161789134
Tuning parameter 'intercept' was held constant at a value of TRUE
# Confirm that the modelling table has no missing values left in any column.
colSums(x = is.na(data_complete), na.rm = FALSE)
streams in_spotify_playlists in_deezer_playlists in_apple_playlists predicted_streams
0 0 0 0 0
# predict() with `interval =` needs the plain lm fit, so take it out of the
# caret train object first.
lm_model <- cv_model[["finalModel"]]

# Fitted values with confidence bounds and with prediction bounds.
pred_conf <- predict(
  object = lm_model,
  newdata = data_complete,
  interval = "confidence"
)
pred_pred <- predict(
  object = lm_model,
  newdata = data_complete,
  interval = "prediction"
)

# One table holding the data, the fit and both sets of bounds.
plot_data <- mutate(
  data_complete,
  predicted_streams = pred_conf[, "fit"],
  conf_low = pred_conf[, "lwr"],
  conf_high = pred_conf[, "upr"],
  pred_low = pred_pred[, "lwr"],
  pred_high = pred_pred[, "upr"]
)
# Predicted against actual streams with the confidence band (light blue)
# and prediction band (orange) shaded; the dashed diagonal marks perfect
# agreement.
ggplot(
  data = plot_data,
  mapping = aes(x = streams, y = predicted_streams)
) +
  geom_point(alpha = 0.6, color = "darkblue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "blue") +
  geom_ribbon(
    mapping = aes(ymin = conf_low, ymax = conf_high),
    fill = "lightblue",
    alpha = 0.3
  ) +
  geom_ribbon(
    mapping = aes(ymin = pred_low, ymax = pred_high),
    fill = "orange",
    alpha = 0.2
  ) +
  ggtitle(
    "Predicted vs Actual Streams with Confidence and Prediction Intervals"
  ) +
  xlab("Actual Streams") +
  ylab("Predicted Streams") +
  theme_minimal()
# Plain lm fit from the caret object (required for interval predictions).
lm_model <- cv_model[["finalModel"]]

# Fitted values with confidence bounds and with prediction bounds.
pred_conf <- predict(
  object = lm_model,
  newdata = data_complete,
  interval = "confidence"
)
pred_pred <- predict(
  object = lm_model,
  newdata = data_complete,
  interval = "prediction"
)

# Attach fit and bounds to the data, then order rows by actual streams.
plot_data <- arrange(
  mutate(
    data_complete,
    predicted_streams = pred_conf[, "fit"],
    conf_low = pred_conf[, "lwr"],
    conf_high = pred_conf[, "upr"],
    pred_low = pred_pred[, "lwr"],
    pred_high = pred_pred[, "upr"]
  ),
  streams
)
# Actual against predicted streams, with the confidence band (light blue)
# and prediction band (orange) shaded around the dashed identity line.
ggplot(
  data = plot_data,
  mapping = aes(x = predicted_streams, y = streams)
) +
  geom_point(alpha = 0.6, color = "darkblue") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "blue") +
  geom_ribbon(
    mapping = aes(ymin = conf_low, ymax = conf_high),
    fill = "lightblue",
    alpha = 0.3
  ) +
  geom_ribbon(
    mapping = aes(ymin = pred_low, ymax = pred_high),
    fill = "orange",
    alpha = 0.2
  ) +
  ggtitle(
    "Predicted vs Actual Streams with Confidence and Prediction Intervals"
  ) +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()
# Same view with the bounds drawn as lines: purple dashed for the confidence
# interval, red dotted for the prediction interval, solid blue identity line.
ggplot(
  data = plot_data,
  mapping = aes(x = predicted_streams, y = streams)
) +
  geom_point(alpha = 0.6, color = "darkblue") +
  geom_abline(intercept = 0, slope = 1, linetype = "solid", color = "blue") +
  geom_line(mapping = aes(y = conf_low), color = "purple", linetype = "dashed") +
  geom_line(mapping = aes(y = conf_high), color = "purple", linetype = "dashed") +
  geom_line(mapping = aes(y = pred_low), color = "red", linetype = "dotted") +
  geom_line(mapping = aes(y = pred_high), color = "red", linetype = "dotted") +
  ggtitle(
    "Actual vs Predicted Streams with Confidence and Prediction Interval Lines"
  ) +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()
# Streams against Spotify playlist count, with the fitted values as a blue
# line and the confidence band shaded.
ggplot(
  data = plot_data,
  mapping = aes(x = in_spotify_playlists, y = streams)
) +
  geom_point() +
  geom_line(mapping = aes(y = predicted_streams), color = "blue") +
  geom_ribbon(mapping = aes(ymin = conf_low, ymax = conf_high), alpha = 0.2) +
  ggtitle("Model Fit with Confidence Interval") +
  ylab("Streams") +
  xlab("Spotify Playlists") +
  theme_minimal()
# Load required packages
library(shiny)
# Define UI: one tab with the mode-filtered scatter plot, one tab with the
# stream predictor.
ui <- fluidPage(
  titlePanel("Predict Spotify Song Streams"),
  tabsetPanel(
    tabPanel(
      "Visualize by Mode",
      sidebarLayout(
        sidebarPanel(
          checkboxGroupInput(
            "selected_modes", "Select Mode(s):",
            choices = unique(spotify$mode),
            selected = unique(spotify$mode)
          )
        ),
        mainPanel(
          plotOutput("modePlot")
        )
      )
    ),
    tabPanel(
      "Predict Streams",
      sidebarLayout(
        sidebarPanel(
          numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
          numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
          numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
          actionButton("predict", "Predict Streams")
        ),
        mainPanel(
          h3("Predicted Streams:"),
          verbatimTextOutput("prediction")
        )
      )
    )
  )
)

# Define server logic
server <- function(input, output) {
  # Prediction recomputed only when the button is clicked. Using
  # eventReactive() keeps the output defined once at the top level of the
  # server function instead of re-registering it inside an observer.
  prediction <- eventReactive(input$predict, {
    # An emptied numeric field yields NA; stop quietly (blank output)
    # instead of displaying "NA".
    req(input$spotify, input$deezer, input$apple)

    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(final_model, newdata = new_input)
  })

  # Rounded prediction with thousands separators; stays empty until the
  # first click.
  output$prediction <- renderText({
    format(round(prediction(), 0), big.mark = ",")
  })

  # Scatter plot limited to the ticked modes; nothing is drawn while no
  # mode is selected.
  output$modePlot <- renderPlot({
    req(input$selected_modes)
    filtered_data <- subset(spotify, mode %in% input$selected_modes)
    ggplot(
      filtered_data,
      aes(x = streams, y = in_spotify_playlists, color = mode)
    ) +
      geom_point() +
      labs(
        title = "Streams vs Playlist Metrics by Mode",
        x = "Streams",
        y = "Number in Spotify Playlists"
      ) +
      theme_minimal()
  })
}

# Run the application
shinyApp(ui = ui, server = server)
Listening on http://127.0.0.1:7921
# Fitted values plotted against themselves form the identity line; actual
# streams are overlaid as points and the confidence band is shaded around
# the fit. Every layer's y is in stream units (the global y was a playlist
# count, which put the line on a different scale from the points and band).
ggplot(plot_data, aes(x = predicted_streams, y = predicted_streams)) +
  geom_point(aes(y = streams), alpha = 0.5) +
  geom_line() +
  geom_ribbon(aes(ymin = conf_low, ymax = conf_high), alpha = 0.2) +
  labs(
    title = "Prediction with Confidence Intervals",
    x = "Predicted Streams",
    y = "Streams"
  )
# Default set of lm diagnostic plots for the fitted model.
plot(x = lm_model)

# Diagnostic plot number 5 on its own.
plot(x = lm_model, which = 5)